import pandas as pd
import numpy as np
from pyproj import Proj, transform
from pyproj import Transformer
import matplotlib.pyplot as plt
import folium
from IPython.display import display
from matplotlib import cm
import seaborn as sns
from matplotlib.ticker import PercentFormatter
from scipy.stats import ttest_ind
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
# Reusable transformer from EPSG:2193 to EPSG:4326, built once at import time
# so every call to transform_coord() shares it.
TRAN_2193_TO_4326 = Transformer.from_crs("EPSG:2193", "EPSG:4326")


def transform_coord(lat, lon):
    """Convert EPSG:2193 coordinates to EPSG:4326 coordinates.

    The arguments are forwarded unchanged, in this order, to
    ``TRAN_2193_TO_4326.transform``. Scalars or array-likes are accepted,
    as supported by pyproj.

    NOTE(review): despite their names, ``lat`` and ``lon`` are *source*
    coordinates in EPSG:2193, not degrees. The transformer is created
    without ``always_xy``, so pyproj presumably expects the axis order
    declared by each CRS -- confirm against the pyproj documentation.

    Returns:
        A 2-tuple holding the two transformed coordinate components, in the
        order pyproj returns them.
    """
    first_axis, second_axis = TRAN_2193_TO_4326.transform(lat, lon)
    return first_axis, second_axis
# Load the raw crash export. The path is machine-specific; adjust it when
# running elsewhere.
raw_df = pd.read_csv("/Users/yasirmuhammad/Downloads/Crash_Analysis_System_(CAS)_data.csv")
raw_df.head(10)

# Keep only the columns used by the analysis. The explicit .copy() makes `df`
# an independent frame: without it the assignments below write into a slice of
# `raw_df`, which is the chained-assignment pattern pandas warns about (the
# warning is hidden by the global warnings filter in this file).
df = raw_df[['X','Y','OBJECTID','bicycle','bus','carStationWagon','crashDirectionDescription','crashLocation1','crashSeverity','crashYear','fatalCount','flatHill','holiday',
             'light','minorInjuryCount','moped','motorcycle','region','speedLimit','suv','taxi','train','truck','tree','weatherA']].copy()

# Count-like columns: a missing value is treated as zero.
cols = ['bicycle','bus','carStationWagon','fatalCount','minorInjuryCount','moped','motorcycle','suv','taxi','train','truck','tree']
df[cols] = df[cols].replace(np.nan,0)

# Missing holiday entries get the placeholder label 'NPH'
# (presumably "not a public holiday" -- confirm against the data dictionary).
df['holiday'] = df['holiday'].replace(np.nan,'NPH')

# Run the coordinate transform once and unpack both components, instead of
# transforming the whole dataset twice. Y is passed first, then X, exactly as
# before; the first result is stored as latitude and the second as longitude.
df['latitude'], df['longitude'] = transform_coord(df['Y'], df['X'])
df = df.drop(columns=['X','Y'])

# Sum of the suv, taxi, carStationWagon, truck and bus columns only; bicycle,
# moped, motorcycle and train are not included in this total.
df['vehicleInvolved'] = df['suv'] + df['taxi'] + df['carStationWagon'] + df['truck'] + df['bus']

# Two-way era split on crashYear: anything before 2010 is "2000s", everything
# else (including any year from 2020 onwards) is labelled "2010s".
df['Time Category'] = np.where(df['crashYear']<2010,"2000s","2010s")
# Figure 1: number of crash records per year (one OBJECTID counted per row).
crashes_per_year = df.groupby('crashYear')['OBJECTID'].count()
crashes_per_year.plot(kind='bar', figsize=(12, 7), color='blue', alpha=0.5)
plt.title('No of accidents in last 21 years', fontsize=20)
plt.ylabel('Number of accidents', fontsize=16)
plt.show()

# Figure 2: two side-by-side panels with the yearly totals of fatalities and
# minor injuries. Each entry is (column to sum, bar colour, title, y label).
yearly_panels = (
    ('fatalCount', 'red', 'Fatalities in last 21 years', 'Fatalities'),
    ('minorInjuryCount', 'blue', 'Injuries in last 21 years', 'No of Injuries'),
)
for panel_no, (count_col, bar_colour, panel_title, y_label) in enumerate(yearly_panels, start=1):
    plt.subplot(1, 2, panel_no)
    yearly_total = df.groupby('crashYear')[count_col].sum()
    yearly_total.plot(kind='bar', figsize=(15, 7), color=bar_colour, alpha=0.5)
    plt.title(panel_title, fontsize=20)
    plt.ylabel(y_label, fontsize=16)
plt.show()
# Row masks shared by both comparisons below.
in_2000s = df['Time Category'] == '2000s'
in_2010s = df['Time Category'] == '2010s'

# Fatalities per crash, restricted to crashes whose severity is 'Fatal Crash'.
is_fatal_crash = df['crashSeverity'] == 'Fatal Crash'
fatal_2000s = df[in_2000s & is_fatal_crash]['fatalCount']
fatal_2010s = df[in_2010s & is_fatal_crash]['fatalCount']

mean_2000s = fatal_2000s.mean()
mean_2010s = fatal_2010s.mean()
# equal_var=False: the two eras are not assumed to share a variance.
t_result = ttest_ind(fatal_2000s, fatal_2010s, equal_var=False)
print(f"The mean of 2000s era is {mean_2000s}, mean of 2010s era is {mean_2010s}, and t-test is {t_result}")

# Minor injuries per crash, restricted to crashes with a non-zero
# minorInjuryCount.
has_minor_injury = df['minorInjuryCount'] != 0
injuries_2000s = df[in_2000s & has_minor_injury]['minorInjuryCount']
injuries_2010s = df[in_2010s & has_minor_injury]['minorInjuryCount']

mean_2000s_inj = injuries_2000s.mean()
mean_2010s_inj = injuries_2010s.mean()
t_result_inj = ttest_ind(injuries_2000s, injuries_2010s, equal_var=False)
print(f"The mean of 2000s era is {mean_2000s_inj}, mean of 2010s era is {mean_2010s_inj}, and t-test is {t_result_inj}")
# Crashes whose severity is 'Fatal Crash'; these are the points drawn on the map.
df2 = df[df['crashSeverity']=='Fatal Crash']

# Base map using the same centre point and zoom level as before.
nz_map = folium.Map(location=[-40.9006,174.8860], zoom_start=6)

# Skip rows with no recorded fatalities using a numeric comparison. The
# previous check cast the count to str and compared it with '0', which never
# matches once the column is float (zero renders as '0.0'), so nothing was
# ever filtered out and popups showed values such as "1.0".
fatal_points = df2[df2['fatalCount'] > 0]

for lat, lng, fatalities in zip(fatal_points['latitude'], fatal_points['longitude'], fatal_points['fatalCount']):
    # Popup text is the whole-number fatality count for that crash.
    label = str(int(fatalities))
    folium.CircleMarker(
        location=[lat, lng],
        radius=3,
        color='red',
        fill=True,
        popup=label,
        fill_color='darkred',
        fill_opacity=0.6,
    ).add_to(nz_map)

# Bare expression: renders the map when this is the last statement of a
# notebook cell; it has no effect when run as a plain script.
nz_map